Load library
library(Seurat)
## Attaching SeuratObject
library(SeuratObject)
library(future)
library(ggprism)
library(patchwork)
library(ggplot2)
Data prepare
### Import the data after removing unwanted cells
options(future.globals.maxSize = 10 * 1024 ^ 3)
plan("multiprocess", workers = 10)
## Warning: Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead,
## explicitly specify either 'multisession' or 'multicore'. In the current R
## session, 'multiprocess' equals 'multisession'.
## Warning in supportsMulticoreAndRStudio(...): [ONE-TIME WARNING] Forked
## processing ('multicore') is not supported when running R from RStudio
## because it is considered unstable. For more details, how to control forked
## processing or not, and how to silence this warning in future R sessions, see ?
## parallelly::supportsMulticore
pbmc <- readRDS("../04_data_objects/SeuratObject_27subjects_ncounts_nfeatures_mito_3MAD.RDS")
### Normalizing the data
plan("multiprocess", workers = 10)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
### Identification of highly variable features (modeling the mean-variance relationship)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
#### Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
#### plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 7124 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 7124 rows containing missing values (geom_point).

### Scaling the data
all.genes <- rownames(pbmc)
plan("multiprocess", workers = 10)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
Perform linear dimensional reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), npcs = 50)
## PC_ 1
## Positive: HLA-DRA, CD74, LYZ, HLA-DRB1, HLA-DPB1, HLA-DPA1, HLA-DQB1, C1QA, HLA-DQA1, C1QB
## CD14, C1QC, FTL, ITGB2, GPR183, HLA-DRB5, VSIG4, MARCO, CD83, HLA-DMB
## PLAUR, RNASE1, CTSB, S100A9, FCGR3A, CXCR4, HCST, CPVL, FOLR2, PLEK
## Negative: COL1A2, PLAC9, C1R, C1S, COL6A2, LUM, PCOLCE, CCDC80, NNMT, MGP
## DCN, CALD1, FSTL1, CLU, CTGF, TIMP3, SPARC, SELM, COL3A1, EFEMP1
## COL6A1, PLA2G2A, BGN, PRG4, COL6A3, COL1A1, ISLR, CAV1, DPT, CYR61
## PC_ 2
## Positive: ACKR1, ECSCR.1, TM4SF1, PLVAP, VWF, CLEC14A, ADGRL4, EMCN, PCAT19, ESAM
## BCAM, CLDN5, NRN1, NPDC1, JAM2, GNG11, PECAM1, RAMP2, CYYR1, C2CD4B
## MMRN2, ADAMTS9, CXorf36, AQP1, TSPAN7, NOSTRIN, SELE, SNCG, TM4SF18, CCL14
## Negative: FTL, CFD, PLTP, FN1, COL1A2, PCOLCE, TIMP1, GPNMB, LUM, C1R
## SERPINF1, C1S, PLA2G2A, COL3A1, CTSL, CTSD, CCDC80, LGALS3BP, COL6A3, SDC2
## CTSB, ISLR, MMP2, COL6A2, COL1A1, LYZ, C1QA, TNFAIP6, APOE, EFEMP1
## PC_ 3
## Positive: MFAP5, FBLN1, SERPINF1, ASPN, FBLN2, SFRP4, THY1, IGF1, PODN, AEBP1
## SRPX, FBLN5, PDGFRL, CRABP2, SFRP2, OLFML3, C3, FAM180B, SFRP1, VCAN
## MFAP4, MAP1B, FNDC1, CXCL14, MEST, GSN, PRELP, CFD, DCN, MGST1
## Negative: CRTAC1, CLIC5, DIRC3, ITGBL1, SEMA5A, ITGB8, ERRFI1, PRG4, GABRA4, GPR1
## PPP1R1A, DNASE1L3, TMEM196, MMP3, NDUFA4L2, HTRA1, OLFM3, NTN4, ANK3, MT2A
## HBEGF, BTC, GJB2, IGFBP5, FGF10, SOX5, MTUS2, ZNF385B, TWISTNB, AK1
## PC_ 4
## Positive: TRAC, IL32, CD3D, TRBC2, CD52, LTB, GZMA, CD2, CD3E, CCL5
## LCK, CD3G, TRBC1, IL2RG, NKG7, CTSW, SPOCK2, CST7, CD7, RHOH
## KLRB1, CLEC2D, GZMM, DUSP2, RAC2, CORO1A, ICAM3, CD247, ISG20, TUBA4A
## Negative: RNASE1, C1QA, C1QB, FTL, C1QC, MARCO, CTSB, NUPR1, FOLR2, IFI27
## PLTP, CTSD, LGMN, SEPP1, GPNMB, VSIG4, HSPB1, FCGR3A, CD14, MS4A4A
## CD163, TREM2, FN1, ALDH1A1, CSTB, HLA-DRB1, HLA-DRA, APOC1, LILRB5, CD74
## PC_ 5
## Positive: ACTA2, MYH11, PLN, RERGL, PPP1R14A, TPM2, ACTG2, CNN1, LMOD1, NOTCH3
## RCAN2, FRZB, TAGLN, MCAM, MYLK, RGS5, GUCY1A3, COX4I2, MAP1B, MYL9
## CCDC3, TPM1, TINAGL1, CDH6, EFHD1, CASQ2, FHL5, WFDC1, PTP4A3, HES4
## Negative: VCAN, EFEMP1, CD74, PTGDS, CHI3L2, SOD2, MDK, TNFSF10, ACKR1, CXCL12
## CHI3L1, DPT, IGFBP4, PLVAP, SFRP1, WISP2, LY6E, AKR1C3, MMP2, HLA-DRB1
## ECSCR.1, RAMP3, VWF, VCAM1, CP, ADGRL4, CFI, SMOC1, PLA2G2A, HLA-DRA
VizDimLoadings(pbmc, dims = 1:5, reduction = "pca")

pca.p1 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "orig.ident",
pt.size = 1)
#cols = ggprism_data$colour_palettes$floral2[1:12])
pca.p2 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "group_id",
pt.size = 1)
# cols = ggprism_data$colour_palettes$prism_dark2[1:4])
pca.p3 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "cohort",
pt.size = 1)
pca.p1 | pca.p2 | pca.p3

ggsave("PCA.pdf", width = 20, height = 5)
DimHeatmap(pbmc, dims = 1:15, cells = 1000, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset
### Determine statistical significance of PCA scores.
# plan("multiprocess", workers = 20)
# pbmc <- JackStraw(pbmc, dims = 20, num.replicate = 100)
#
# ### Compute Jackstraw scores significance.
# pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
Cluster the cells
pbmc <- FindNeighbors(pbmc, dims = 1:50)
## Computing nearest neighbor graph
## Computing SNN
plan("multiprocess", workers = 20)
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 59845
## Number of edges: 2244654
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9437
## Number of communities: 31
## Elapsed time: 15 seconds
## 1 singletons identified. 30 final clusters.
## Warning: UNRELIABLE VALUE: One of the 'future.apply' iterations
## ('future_lapply-1') unexpectedly generated random numbers without declaring so.
## There is a risk that those random numbers are not statistically sound and the
## overall results might be invalid. To fix this, specify 'future.seed=TRUE'. This
## ensures that proper, parallel-safe random numbers are produced via the L'Ecuyer-
## CMRG method. To disable this check, use 'future.seed = NULL', or set option
## 'future.rng.onMisuse' to "ignore".
pbmc <- RunUMAP(pbmc, dims = 1:50)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:27:06 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:27:06 Read 59845 rows and found 50 numeric columns
## 16:27:06 Using Annoy for neighbor search, n_neighbors = 30
## 16:27:06 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:27:14 Writing NN index file to temp file /tmp/RtmpAo7VXO/file2be0343d013350
## 16:27:14 Searching Annoy index using 20 threads, search_k = 3000
## 16:27:15 Annoy recall = 100%
## 16:27:15 Commencing smooth kNN distance calibration using 20 threads
## 16:27:17 Initializing from normalized Laplacian + noise
## 16:27:21 Commencing optimization for 200 epochs, with 2661832 positive edges
## 16:27:45 Optimization finished
umap.p1 <- DimPlot(pbmc, reduction = "umap", pt.size = 1)
umap.p2 <- DimPlot(pbmc, reduction = "umap",
group.by = "group_id", pt.size = 1)
umap.p3 <- DimPlot(pbmc, reduction = "umap",group.by = "group_id",
split.by = "group_id",pt.size = 1)
(umap.p1 | umap.p2) /
umap.p3

ggsave("Clustering_UMAP.pdf", width = 11, height = 8)
DimPlot(pbmc, reduction = "umap",
group.by = "orig.ident", pt.size = 1)

DimPlot(pbmc, reduction = "umap",
split.by = "cohort", pt.size = 1)

#DimPlot(pbmc, reduction = "umap",
# split.by = "Sample",pt.size = 1)
#saveRDS(pbmc, file = "SeuratObject_27subjects_ncounts_nfeatures_mito_3MAD_PCA_clustering.RDS")
DimPlot(pbmc, reduction = "umap",
split.by = "orig.ident",pt.size = 1,ncol = 7)

ggsave("clustering_umap_by_sample.pdf", width = 14,height = 10)
SCTransform
options(future.globals.maxSize = 10 * 1024 ^ 3)
plan("multiprocess", workers = 10)
pbmc <- readRDS("../04_data_objects/SeuratObject_27subjects_ncounts_nfeatures_mito_3MAD.RDS")
pbmc1 <- pbmc
pbmc1 <- SCTransform(pbmc1, assay = "RNA", method = "glmGamPoi")
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 22737 by 59845
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 72 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 22737 genes
##
|
| | 0%
|
|== | 2%
|
|=== | 4%
|
|===== | 7%
|
|====== | 9%
|
|======== | 11%
|
|========= | 13%
|
|=========== | 15%
|
|============ | 17%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|======================= | 33%
|
|======================== | 35%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================= | 41%
|
|============================== | 43%
|
|================================ | 46%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|=========================================================== | 85%
|
|============================================================= | 87%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|=================================================================== | 96%
|
|==================================================================== | 98%
|
|======================================================================| 100%
## Computing corrected count matrix for 22737 genes
##
|
| | 0%
|
|== | 2%
|
|=== | 4%
|
|===== | 7%
|
|====== | 9%
|
|======== | 11%
|
|========= | 13%
|
|=========== | 15%
|
|============ | 17%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|======================= | 33%
|
|======================== | 35%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================= | 41%
|
|============================== | 43%
|
|================================ | 46%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|=========================================================== | 85%
|
|============================================================= | 87%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|=================================================================== | 96%
|
|==================================================================== | 98%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 3.93477 mins
## Determine variable features
## Place corrected count matrix in counts slot
## Centering data matrix
## Set default assay to SCT
pbmc <- pbmc1
## Perform linear dimensional reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), npcs = 50)
## PC_ 1
## Positive: PLA2G2A, PRG4, DCN, CLU, LUM, MGP, COL3A1, COL1A2, CTGF, PTGDS
## CRTAC1, CXCL12, IGFBP5, C1R, GSN, COL1A1, HTRA1, IGFBP6, MMP3, FN1
## PCOLCE, EFEMP1, C1S, CCDC80, TIMP3, TNFAIP6, MFAP5, CFD, PLAC9, COL6A2
## Negative: FTL, HLA-DRA, CD74, LYZ, C1QA, C1QB, RNASE1, S100A8, HLA-DRB1, FTH1
## S100A9, HLA-DPB1, HLA-DPA1, HLA-DRB5, C1QC, CXCL8, TYROBP, MARCO, HLA-DQA1, TMSB10
## FCER1G, HLA-DQB1, AIF1, ACTB, CCL3, SRGN, APOC1, TMSB4X, SPP1, MT-CO1
## PC_ 2
## Positive: PRG4, MMP3, FN1, CRTAC1, CLU, HTRA1, MT2A, TIMP1, MMP1, IGFBP5
## PLA2G2A, FTL, ERRFI1, TNFAIP6, MT1X, ITGBL1, TIMP3, NDUFA4L2, GPR1, CXCL1
## CLIC5, MT1G, DIRC3, S100A4, DEFB1, GPX3, HAS1, C11orf96, PCOLCE, TWISTNB
## Negative: DCN, IGFBP7, TM4SF1, ACKR1, CCL2, GSN, TAGLN, MFAP5, SFRP4, ADIRF
## CXCL14, APOD, ACTA2, ASPN, COL1A1, IGFBP6, CXCL12, FABP4, FBLN1, CLDN5
## PLVAP, COMP, FBLN2, SELE, SFRP2, MYL9, VWF, ECSCR.1, CFD, GNG11
## PC_ 3
## Positive: TM4SF1, ACKR1, PRG4, SPARCL1, IGFBP7, ADIRF, CLDN5, CRTAC1, PLVAP, FABP4
## SELE, MMP3, CAV1, VWF, ECSCR.1, GNG11, CLEC14A, CCL2, ACTA2, SOX17
## ADGRL4, AQP1, CLU, RGS16, HTRA1, FN1, TAGLN, RND1, RAMP2, C2CD4B
## Negative: DCN, FTL, LUM, CFD, MFAP5, PLA2G2A, COL3A1, COL1A1, IGFBP6, SFRP4
## GSN, APOE, ASPN, COL1A2, CXCL12, C1QA, C1QB, COMP, CXCL14, DPT
## HLA-DRA, RNASE1, FBLN1, S100A8, WISP2, FTH1, CD74, APOC1, APOD, CST3
## PC_ 4
## Positive: IL32, CCL5, NKG7, CD52, LTB, TRAC, GZMA, GNLY, CXCR4, CD3D
## KLRB1, GZMB, TRBC2, IGKC, TRBC1, JCHAIN, MZB1, CST7, CTSW, GZMH
## FCER1A, CREM, CD2, IGHG1, DUSP2, CD69, IGHG3, GZMK, CD7, FCN1
## Negative: FTL, RNASE1, C1QA, C1QB, APOE, IGFBP7, APOC1, TM4SF1, C1QC, MARCO
## TAGLN, CCL2, SPARCL1, ACKR1, ADIRF, ACTA2, IFI27, S100A6, MYL9, SEPP1
## S100A4, TPM2, PRG4, FOLR2, LGMN, CFD, MYH11, FABP4, CCL18, NUPR1
## PC_ 5
## Positive: MFAP5, IGFBP6, PRG4, SFRP4, TAGLN, PI16, ACTA2, PCOLCE2, IGFBP5, ASPN
## CRTAC1, MYL9, GSN, FBN1, TPPP3, TPM2, COL1A1, MYH11, FBLN2, RERGL
## TNXB, SPARCL1, ADIRF, DCN, S100A4, CPE, S100A6, CRABP2, PRELP, HTRA3
## Negative: PTGDS, CXCL12, APOE, CLU, CHI3L1, CHI3L2, CTGF, EFEMP1, CCL2, TNFAIP6
## RARRES2, SFRP1, VCAM1, COL14A1, CYR61, C1R, NNMT, LUM, IGFBP4, CP
## JUN, MDK, PLA2G2A, FGF7, C7, EGR1, SCRG1, ACKR1, C1S, ADAMTS1
VizDimLoadings(pbmc, dims = 1:5, reduction = "pca")

pca.p1 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "orig.ident",
pt.size = 1)
#cols = ggprism_data$colour_palettes$floral2[1:12])
pca.p2 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "group_id",
pt.size = 1)
# cols = ggprism_data$colour_palettes$prism_dark2[1:4])
pca.p3 <- DimPlot(object = pbmc,
reduction = "pca",
group.by = "cohort",
pt.size = 1)
pca.p1 | pca.p2 | pca.p3

ggsave("PCA_sctransform.pdf", width = 20, height = 5)
DimHeatmap(pbmc, dims = 1:15, cells = 1000, balanced = TRUE)

Cluster the cells
pbmc <- FindNeighbors(pbmc, dims = 1:50)
## Computing nearest neighbor graph
## Computing SNN
plan("multiprocess", workers = 20)
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 59845
## Number of edges: 2066347
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9308
## Number of communities: 28
## Elapsed time: 20 seconds
pbmc <- RunUMAP(pbmc, dims = 1:50)
## 16:36:18 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:36:19 Read 59845 rows and found 50 numeric columns
## 16:36:19 Using Annoy for neighbor search, n_neighbors = 30
## 16:36:19 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:36:27 Writing NN index file to temp file /tmp/RtmpAo7VXO/file2be03470a98009
## 16:36:27 Searching Annoy index using 20 threads, search_k = 3000
## 16:36:29 Annoy recall = 100%
## 16:36:29 Commencing smooth kNN distance calibration using 20 threads
## 16:36:32 Initializing from normalized Laplacian + noise
## 16:36:36 Commencing optimization for 200 epochs, with 2566770 positive edges
## 16:37:01 Optimization finished
umap.p1 <- DimPlot(pbmc, reduction = "umap", pt.size = 1)
umap.p2 <- DimPlot(pbmc, reduction = "umap",
group.by = "group_id", pt.size = 1)
umap.p3 <- DimPlot(pbmc, reduction = "umap",group.by = "group_id",
split.by = "group_id",pt.size = 1)
(umap.p1 | umap.p2) /
umap.p3

ggsave("Clustering_UMAP_sctransform.pdf", width = 11, height = 8)
DimPlot(pbmc, reduction = "umap",
group.by = "orig.ident", pt.size = 1)

DimPlot(pbmc, reduction = "umap",
split.by = "cohort", pt.size = 1)

#saveRDS(pbmc, file = "SeuratObject_27subjects_ncounts_nfeatures_mito_3MAD_PCA_clustering_sctransform.RDS")
DimPlot(pbmc, reduction = "umap",
split.by = "orig.ident",pt.size = 1,ncol = 7)

ggsave("clustering_umap_by_sample_sctransform.pdf", width = 14,height = 10)